import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import ScalarFormatter
import altair as alt
# set the default renderer to vega
alt.data_transformers.enable("vegafusion")
# Set display format for floating-point numbers
pd.options.display.float_format = '{:,.2f}'.format

# read from a csv file into a pd dataframe
df_ranking = pd.read_csv('~/Downloads/bi_ranking.csv', low_memory=False)
# the companias string 
df_ids = pd.read_csv('~/Downloads/bi_compania.csv', low_memory=False)

# segementos
df_segmentos = pd.read_csv('~/Downloads/bi_segmento.csv', low_memory=False)

# Código de Clasificacón Industrial Internacional Unifrome
df_ciiu = pd.read_csv('~/Downloads/bi_ciiu.csv', low_memory=False)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 13
     10 pd.options.display.float_format = '{:,.2f}'.format
     12 # read from a csv file into a pd dataframe
---> 13 df_ranking = pd.read_csv('~/Downloads/bi_ranking.csv', low_memory=False)
     14 # the companias string 
     15 df_ids = pd.read_csv('~/Downloads/bi_compania.csv', low_memory=False)

File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File ~/.local/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File ~/.local/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: '/home/terac/Downloads/bi_ranking.csv'

Let’s only get the year 2023 and make the passivos by subtracting the patrimonio from the activos#

let’s merge all of the df into one#

# get the anio with 2023
df_ranking = df_ranking[df_ranking['anio'] == 2023]

# rename the cuii column 
df_ciiu = df_ciiu.rename(columns={'descripcion': 'ciiu_desc', 'ciiu': 'ciiu_code'}) 
df_ciiu['ciiu_code'] = df_ciiu['ciiu_code'].str.strip()

# match all of the expedientes in the df_ids with the expedientes in the df_ranking 
df = pd.merge(df_ranking, df_ids, on='expediente', how='left')

# let's merge the ciiu and the segments with the 
df = pd.merge(df, df_ciiu, left_on='ciiu_n1', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n1_code', 'ciiu_desc': 'ciiu_n1_desc'}, inplace=True)
df = pd.merge(df, df_ciiu, left_on='ciiu_n6', right_on='ciiu_code', how='left')
df.rename(columns={'ciiu_code': 'ciiu_n6_code', 'ciiu_desc': 'ciiu_n6_desc'}, inplace=True)

# get the passivos by subtracting patrimonio from activos
df['pasivos'] = df['activos'] - df['patrimonio']

# check that there are error where activos is not equal to passivos + patrimonio
#df_error = df[df['activos'] != df['pasivos'] + df['patrimonio']]

# get the name and the passivos columns only
#df_error = df_error[['nombre', 'activos', 'pasivos', 'patrimonio']]

#df_error['margin'] = df_error['activos'] - df_error['pasivos'] - df_error['patrimonio']

# df print nu
#df_error.head()

print(df.columns)
Index(['anio', 'expediente', 'posicion_general', 'cia_imvalores',
       'id_estado_financiero', 'ingresos_ventas', 'activos', 'patrimonio',
       'utilidad_an_imp', 'impuesto_renta', 'n_empleados', 'ingresos_totales',
       'utilidad_ejercicio', 'utilidad_neta', 'cod_segmento', 'ciiu_n1',
       'ciiu_n6', 'liquidez_corriente', 'prueba_acida', 'end_activo',
       'end_patrimonial', 'end_activo_fijo', 'end_corto_plazo',
       'end_largo_plazo', 'cobertura_interes', 'apalancamiento',
       'apalancamiento_financiero', 'end_patrimonial_ct',
       'end_patrimonial_nct', 'apalancamiento_c_l_plazo', 'rot_cartera',
       'rot_activo_fijo', 'rot_ventas', 'per_med_cobranza', 'per_med_pago',
       'impac_gasto_a_v', 'impac_carga_finan', 'rent_neta_activo',
       'margen_bruto', 'margen_operacional', 'rent_neta_ventas',
       'rent_ope_patrimonio', 'rent_ope_activo', 'roe', 'roa',
       'fortaleza_patrimonial', 'gastos_financieros', 'gastos_admin_ventas',
       'depreciaciones', 'amortizaciones', 'costos_ventas_prod', 'deuda_total',
       'deuda_total_c_plazo', 'total_gastos', 'ruc', 'nombre', 'tipo',
       'pro_codigo', 'provincia', 'ciiu_n1_code', 'ciiu_n1_desc',
       'ciiu_n6_code', 'ciiu_n6_desc', 'pasivos'],
      dtype='object')

let seperate the companies into categoris by size#

# make companies categories between 0 and 2.5 billion in assets
# small companies: 0 - 500 thousand
# medium companies: 500 thousand - 50 million
# large companies: 50 million - up 

df_small = df[df['activos'] < 500000]

df_medium = df[(df['activos'] >= 500000) & (df['activos'] < 50000000)]

df_large = df[(df['activos'] >= 50000000)]

# get the number of companies in each category and make a pie chart
pie_sizes_df = pd.DataFrame({
    'size': ['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
    'count': [df_small.shape[0], df_medium.shape[0], df_large.shape[0]]
})

pie_df['percentage'] = (pie_df['count'] / pie_df['count'].sum()) * 100

# Define custom colors for each category using named colors
color_scale = alt.Scale(domain=['Small - 0 - 500k', 'Medium - 500k - 50m', 'Large - 50 - up'],
    range=['steelblue', 'orange', 'mediumseagreen' ])

pie_df.head()

pie_chart = alt.Chart(pie_df).mark_arc().encode(
    theta=alt.Theta(field="count", type="quantitative"),
    color=alt.Color(field="size", type="nominal", scale=color_scale),
    tooltip=['size', 'count', alt.Tooltip('percentage:Q', format='.2f')]
).interactive()

pie_chart.display()
type_counts = df['tipo'].value_counts().reset_index()
type_counts.columns = ['tipo', 'count']
type_counts['percentage'] = (type_counts['count'] / type_counts['count'].sum()) * 100

alt.Chart(type_counts).mark_arc().encode(
    theta=alt.Theta(field="count", type="quantitative"),
    color=alt.Color(field="tipo", type="nominal"),
    tooltip=['tipo', 'count', alt.Tooltip('percentage:Q', format='.2f')]
    # show percentge in legentd
    
).interactive().display()
def make_chart(df, title='Activos de Companias', max_bins=100, color='steelblue', width=450, height=300):
    # Create a selection
    click = alt.selection_point(encodings=['x'])
    
    # Create the right bar chart
    bars = alt.Chart(df).mark_bar().encode(
        x=alt.X('activos:Q', bin=alt.Bin(maxbins=max_bins), title='Activos'),
        y=alt.Y( 'count()', title='Numero de companias'),
        color=alt.condition(click, alt.ColorValue(color), alt.ColorValue(color)),
        tooltip=['count():Q']
    ).add_params(
        click
    ).properties(
        width=width,
        height=height,
        title=title,
    ).interactive()

    # Create the left chart (scatter plot)
    scatter = alt.Chart(df).mark_circle().encode(
        x='pasivos:Q',
        y='activos:Q',
        color='ciiu_n1_desc:N',
        tooltip=['nombre:N', 'activos:Q', 'pasivos:Q', 'patrimonio:Q', 'ciiu_n1_desc:N']
    ).transform_filter(
        click
    ).properties(
        width=width,
        height=height,
        title=''
    ).interactive()
    
    # Filter the left chart based on selection
    filtered_scatter = scatter.transform_filter(
        click
    )
    
    # Combine the charts
    chart = alt.hconcat(bars, filtered_scatter).resolve_legend(
        color="independent",
    )
    chart.show()
make_chart(df_small, title='Activos de Companias pequenas', color='steelblue')
make_chart(df_medium, title='Activos de Companias Medianas', color='orange')
make_chart(df_large, title='Activos de Companias Grandes', color='mediumseagreen')